FE8828 Programming Web Applications in Finance

Week 4
Data, visualization, and web: part 3

Dr. Yang Ye <Email:yy@runchee.com>

Nov 23, 2017

Lecture 8: Viz

library(ggplot), author Hadley Wickham. First release on June 10, 2007.

https://qz.com/1007328/all-hail-ggplot2-the-code-powering-all-those-excellent-charts-is-10-years-old/

Why do we need ggplot?

It’s part of the exploration of the data via visualization.

ggplot system

library(ggplot2)
ggplot(airquality, aes(Temp, Ozone)) + 
        geom_point() + 
        geom_smooth(method = "loess", se = FALSE)
## Warning: Removed 37 rows containing non-finite values (stat_smooth).
## Warning: Removed 37 rows containing missing values (geom_point).

Syntax of ggplot

Simply plot x and y

ggplot(bank, aes(age, balance)) + geom_point()

ggplot(bank, aes(age, balance, color = job)) + geom_point()

If you don’t know the column name, use aes_string to pass variable name as string/character.

ggplot(bank, aes_string("age", "balance", color = "job")) + geom_point()

For non-numeric data

ggplot(bank, aes(default, age)) + geom_point()

ggplot(bank, aes(age, default)) + geom_point()

ggplot(bank, aes(job, age)) + geom_point()

Add 2nd geometry

ggplot(bank, aes(age, balance)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam'

ggplot(bank, aes(age, balance, color = job)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'loess'

Pass aes down

ggplot(bank, aes(x = age, y = duration)) +
geom_smooth() +
geom_point()
## `geom_smooth()` using method = 'gam'


# This is equivalent to below
ggplot(bank) +
geom_point(aes(x = age, y = duration)) +
geom_smooth(aes(x = age, y = duration))
## `geom_smooth()` using method = 'gam'


# But we can do specify different data for two geom_smooth()
ggplot(bank) +
geom_point(aes(x = age, y = duration)) +
geom_smooth(data = filter(bank, age > median(age)), aes(x = age, y = duration), color = "green") +
geom_smooth(data = filter(bank, age <= median(age)), aes(x = age, y = duration), color = "red")
## `geom_smooth()` using method = 'gam'
## `geom_smooth()` using method = 'gam'

Adjustment

# adjust legent position
ggplot(bank, aes(x = age, y = duration, color = job)) +
geom_point() +
theme(legend.position="bottom")


ggplot(bank, aes(x = age, y = duration, color = job)) +
geom_point() +
theme(legend.position="left")


# Different feeling?
ggplot(bank, aes(x = age, y = duration, color = job)) +
geom_point() +
theme(legend.position="left") + 
coord_flip()


# Make y as log scaled.
# Note that before flip, x is y, so we use scale_y_log10()
ggplot(bank, aes(x = age, y = duration, color = job)) +
geom_point() +
theme(legend.position="left") + 
coord_flip() +
scale_y_log10()

Each + is a layer

# Nearly empty chart.
g <- ggplot(bank, aes(x = age, y = duration))
g


# This is almost empty
g <- ggplot(bank)
g


# This is really empty.
g <- ggplot()
g

Combine g with layers

ggplot(bank, aes(x = age, y = duration)) + 
  geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam'


# This is equivalent to above
g <- ggplot(bank, aes(x = age, y = duration))
g + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam'

# g can be re-used. It's good to be used when we want to exploratory data. 
# Fixed a few variables in `g <- ggplot(data, aes(...))`.
# Use `g + geom_XXX()` to find the best representation for the relationship.
g + geom_point() + geom_smooth(method = "lm") + facet_grid(. ~ job)

g + geom_point(color = "steelblue", size = 4, alpha = 1/2)

g + geom_point(aes(color = job), size = 4, alpha = 1/2)

g + geom_point() + geom_point(aes(color = job), size = 4, alpha = 1/2) 

Use color/shape/size/alpha/group to differentiate to different groups.

ggplot(bank) +
  geom_point(aes(age, duration, shape = contact))


ggplot(bank) +
  geom_point(aes(age, duration, color = contact))


ggplot(bank) +
  geom_point(aes(age, duration, size = contact))
## Warning: Using size for a discrete variable is not advised.


ggplot(bank) +
  geom_point(aes(age, duration, alpha = contact))


ggplot(bank) +
  geom_point(aes(age, duration, group = contact))

## you can also enforce color, put things outside aes
ggplot(bank) +
  geom_point(aes(age, duration), color = "blue", size = 10)

Things to consider:

Exercise

ggplot(bank, aes(age, job)) + geom_point()


# Reverse a categorical variable, we use rev(levels(...)).
# Reverse a continous numerical variable, we use scale_x_reverse().
ggplot(bank, aes(age, job)) +
geom_point() +
scale_y_discrete(limit = rev(levels(bank$job)))

## Warning: Using size for a discrete variable is not advised.

Other geoms

Boxplot

ggplot(bank, aes(job, duration)) + geom_boxplot()

ggplot(bank, aes(job, age)) + geom_boxplot()

Density

ggplot(bank, aes(balance, color = job)) + geom_density()

ggplot(bank, aes(duration, fill = job)) + geom_density()

ggplot(bank, aes(age, color = job, alpha = 0.3)) + geom_density()

# Which is better?
ggplot(bank, aes(age, color = job, fill = job, alpha = 0.3)) + geom_density()

histogram

ggplot(data = bank, mapping = aes(x = duration, fill = job)) + geom_histogram(binwidth = 2)

ggplot(data = bank, mapping = aes(x = duration, fill = job)) + geom_histogram(binwidth = 100)

ggplot(data = bank, mapping = aes(x = age, fill = job)) + geom_histogram(binwidth = 10)

ggplot(data = bank, mapping = aes(x = age, colour = job)) + geom_freqpoly(binwidth = 10)

geom_bar: bar is a statistical function: it counts.

# first input parameter to geom_bar is mapping, so we can skip it.
ggplot(bank) + geom_bar(mapping = aes(x = age))

# in short, we skip mapping
ggplot(bank) + geom_bar(aes(x = age))


# comparing to colour, for Bar, we better use fill
# ggplot(data = bank, ) + geom_bar(aes(x = age, colour = job))
ggplot(bank) + geom_bar(mapping = aes(x = age, fill = job))

ggplot(bank) +
  geom_bar(mapping = aes(x = job))

# Color doesn't work, because age is a continous variable.
ggplot(bank) +
  geom_bar(mapping = aes(x = job, fill = age)) 

Position for bar

ggplot(bank) + geom_bar(mapping = aes(x = age, fill = job))

# fill to 100%
ggplot(bank) + geom_bar(mapping = aes(x = age, fill = job), position = "fill")

# dodge means "adaptive width of the bar"
ggplot(bank) + geom_bar(mapping = aes(x = age, fill = job), position = "dodge")

Variations

ggplot(bank) +
geom_bar(mapping = aes(x = age, fill = job), position = "fill") +
coord_flip()

ggplot(bank) +
geom_bar(mapping = aes(x = age, fill = job), position = "fill") +
coord_flip() +
scale_x_reverse()

ggplot(bank) +
geom_bar(mapping = aes(x = age, fill = job), position = "fill") +
coord_polar()

geom_bar: better serves for categorical data

ggplot(data = bank, mapping = aes(x = job, fill = education)) + geom_bar()

ggplot(data = bank, mapping = aes(x = job, fill = education)) + geom_bar() + coord_flip()

ggplot(data = bank, mapping = aes(x = reorder(job, age, FUN = mean), fill = education)) +
  geom_bar() +
  coord_flip()

ggplot(data = bank, mapping = aes(x = reorder(job, age, FUN = mean), fill = education)) +
    geom_bar() +
    coord_flip()

# If we just to order job according to alphabetical order.
# use rev(levels(...))
ggplot(data = bank, mapping = aes(x = reorder(job, age, FUN = median), fill = education)) +
  geom_bar() +
  scale_x_discrete(limit = rev(levels(bank$job))) +
  coord_flip()

Bar with composite data

# If we want to sort the job acccording to median age
# And also add age range and median age.
ggplot(data = bank, mapping = aes(x = reorder(job, age, FUN = median), fill = education)) +
  geom_bar() +
  scale_x_discrete(limit = rev(levels(reorder(bank$job, bank$age, FUN = median)))) +
  geom_line(aes(x = job, y = age)) +
  geom_point(data = group_by(bank, job) %>% summarize(age = median(age)) %>% ungroup, aes(x = job, y = age), inherit.aes = FALSE) +
  xlab("Job sorted according to\nMedian age\n(Top - younger)") +
  coord_flip()

Data with statistical

ggplot(data = bank) + 
  stat_summary(
    mapping = aes(x = age, y = balance),
    fun.ymin = min,
    fun.ymax = max,
    fun.y = median
  )

Facets

# If just want to
ggplot(data = bank) + 
  geom_point(mapping = aes(x = age, y = duration)) + 
  facet_wrap(~ education, nrow = 2)

Facets - finding the best

# doesn't look great because we have so many jobs.
ggplot(bank, aes(pdays)) + geom_histogram() + facet_grid(job ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


# Not a good choice, neither
ggplot(bank, aes(pdays)) + geom_histogram() + facet_grid(. ~ job)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


# Can we do better?
ggplot(bank, aes(campaign)) + geom_histogram() + facet_grid(. ~ job)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


ggplot(bank, aes(duration)) + geom_histogram(aes(color = job)) + facet_grid(. ~ job)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


# facet with points is good
ggplot(bank, aes(balance, age)) + geom_point() + facet_grid(. ~ job)

# do better
ggplot(bank, aes(balance, age)) + geom_point(aes(color = job)) + facet_grid(. ~ job)


# Can we apply points between age and balance?
ggplot(bank, aes(age, balance, color = job)) + geom_point() + geom_smooth() + facet_grid(. ~ job)
## `geom_smooth()` using method = 'loess'


# Smooth line is mixed with points
ggplot(bank, aes(age, balance)) + geom_point(aes(color = job)) + geom_smooth() + facet_grid(. ~ job)
## `geom_smooth()` using method = 'loess'

With facets or without facets?

ggplot(bank, aes(previous)) + geom_histogram() + facet_grid(. ~ job)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# v.s.
ggplot(bank, aes(previous)) + geom_histogram(aes(fill = job)) + facet_grid(. ~ job)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# v.s.
ggplot(bank, aes(previous)) + geom_density(aes(fill = job))

With facets or without facets - Case 2?

ggplot(bank, aes(previous)) + geom_histogram() + facet_grid(. ~ marital)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# v.s.
ggplot(bank, aes(previous)) + geom_density(aes(fill = marital))

# v.s.
ggplot(bank, aes(previous)) + geom_density(aes(fill = marital), alpha = 0.7) + xlim(1, 10)
## Warning: Removed 3725 rows containing non-finite values (stat_density).

Facets in the full power.

# Levels gives more control to the layer and style.
cutpoints <- quantile(bank$age, seq(0, 1, length = 4), na.rm = TRUE)
# The age_group variable is now a categorical factor variable containing 3 levels, indicating the ranges of age.
bank$age_group <- cut(bank$age, cutpoints)
levels(bank$age_group)
## [1] "(19,35]" "(35,45]" "(45,87]"
# Use facet_wrap to specify nrow/ncol.
ggplot(bank, aes(age, duration)) +
  geom_point(alpha = 1/3) + 
  facet_wrap(job ~ age_group, nrow = 2) + # ncol = number of cuts 3 = length(levels(bank$age_group))
  geom_smooth(method="lm", se=FALSE, col="steelblue") + 
  theme_bw(base_family = "Avenir", base_size = 10) + 
  labs(x = "age", y = expression("log " * Duration)) + 
  scale_y_log10() +
  labs(title = "Bank Clients")  
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x
## $y, : font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

Add theme

default theme is theme_gray()

g <- ggplot(bank, aes(x = age, y = log10(duration)))
g + geom_point(aes(color = job), size = 4, alpha = 1/2) + theme_bw()

g + geom_point(aes(color = job), size = 4, alpha = 1/2) + theme_void()


g + geom_point(aes(color = job), size = 4, alpha = 1/2) + theme_minimal() +
    labs(title = "Duration is longer with age",
        subtitle = "some random plot",
        caption = "from MFE") +
    labs(x = "age", y = expression("log " * Duration))

ggthemes

library(ggthemes)
## [1] "theme_base" "theme_calc"
## [3] "theme_economist" "theme_economist_white"
## [5] "theme_excel" "theme_few"
## [7] "theme_fivethirtyeight" "theme_foundation"
## [9] "theme_gdocs" "theme_hc"
## [11] "theme_igray" "theme_map"
## [13] "theme_pander" "theme_par"
## [15] "theme_solarized" "theme_solarized_2"
## [17] "theme_solid" "theme_stata"
## [19] "theme_tufte" "theme_wsj"

ggplot summary

ggplot(data = ) + ( mapping = aes(), stat = , position = ) + +

ggplot: one more thing.

Assignment

Lecture 9: Shiny

Display output with render*() functions

render* - Allow binding of one output to multiple inputs

output$hist <- renderPlot({
  hist(data())
})

output$stat <- renderPlot({
  summary(data())
})

observeEvent

When codes gets to run.

renderUI

Create dynamic output

# shiny-34-renderUI.R

library(shiny)

ui <- fluidPage(
  uiOutput("p1")
)

server <- function(input, output, session) {
  output$p1 <- renderUI({
    tagList(
      h1("HTML t1"),
      uiOutput("t1"),
      h1("Plot p1"),
      plotOutput("p1")
    )
  })
}

shinyApp(ui, server)

Create dynamic output 2

You can use newly created UI immeidately

# shiny-34-renderUI.R

library(shiny)

ui <- fluidPage(
  uiOutput("p1")
)

server <- function(input, output, session) {
  output$p1 <- renderUI({
    tl <- tagList(
      h1("HTML t1"),
      uiOutput("t1"),
      h1("Plot p1p1"),
      plotOutput("p1p1")
    )

    tl
  })
  
  output$t1 <- renderUI({
    tagList(
      h1("HTML p1t1 inside t1"),
      plotOutput("p1t1")
    )
  })

  output$p1t1 <- renderPlot({
    # hist(runif(10000))
    plot(1:100, runif(100))
  })
  
  output$p1p1 <- renderPlot({
    plot(1:100, runif(100))
  })
  
  
}

shinyApp(ui, server)

Create dynamic output 3

library(shiny)
library(knitr)
library(kableExtra)

ui <- fluidPage(
  numericInput("num", "Num", 3),
  uiOutput("p1"),
  hr(),
  tableOutput("p2")
)

server <- function(input, output, session) {
  observe({
    row_num <- input$num

    output$p1 <- renderUI({
      tagList(
        tags$h1("This is a header"),
        {
          if (row_num > 0 & row_num < 7) {
            hx <- paste0("h", row_num)
            (tags[[hx]])(toupper(hx))
          } else {
            (tags[["h6"]])(toupper("h6"))
          }
        },
        numericInput("num_plot", "Give a number", value = round(runif(1, min = 0, max = nrow(iris)), 0), min = 0, max = nrow(iris)),
        plotOutput("plot"),
        
        tags$h3("kable can't be used with tagList."),
        kable(iris[1:row_num, , drop = T], format = "html")
      )
    })
    
    # num_plot is the newly created input.
    # plot is the newly created output.
    # You can use the newly created input/output immediately
    # This is particularly useful for creating multiple plots and tables.
    output$plot <- renderPlot({
      if (input$num_plot > 0) {
        ggplot(iris[1:input$num_plot, , drop = F], aes(x = Sepal.Length, y = Petal.Width)) +
          geom_point() +
          geom_smooth() +
          theme_minimal()
      }
    })

    # Use anything together with kable, use function() { paste0(...) }
    output$p2 <- function() {
      paste0(
        tags$h1("kable is used inside a function()"),
        kable(iris[1:row_num, , drop = T], format = "html"))
    }
  })
}

shinyApp(ui, server)

Create dynamic input 4

uiOutput(“h1”) output$h1 <- renderUI({ tagList( sliderInput(“n”, “N”, 1, 1000, 500), textInput(“label”, “Label”) ) })

Update various input values

updateSelectionInput(…) updateNumericInput(…)

Dynamic input and update***Input

library(shiny)

ui <- fluidPage(
  uiOutput("p1"),
  verbatimTextOutput("o1")
)

scenarios <- c(-100, -50, 0, 50, 100)

server <- function(input, output, session) {
  output$p1 <- renderUI({
    tagList(
      numericInput("shock", "Shock", value = round(runif(1) * 1000), 0),
      actionButton("add", "Add"),
      checkboxGroupInput("scenarios", "Scenarios", choices = c(), selected = c())
    )
  })
  
  updateCheckboxGroupInput(session, "scenarios",
                           choices = scenarios,
                           selected = scenarios)  

  observeEvent(input$add, {
    shock <- isolate(input$shock)
    if (!(shock %in% scenarios)) {
      scenarios <<- sort(c(scenarios, shock))
      updateCheckboxGroupInput(session, "scenarios",
                               choices = scenarios,
                               selected = scenarios)
    }
    updateNumericInput(session, "shock", value = round(runif(1) * 1000))
  })
  
  output$o1 <- renderPrint({
    x <- input$scenarios
    str(x)
    cat(paste0("length: ", length(x), "\n"))
    cat(paste0(x, "\n"))
  })
}

shinyApp(ui, server)

ggplot/gridExtra

If we need to generate multiple plots. ggplot has a companion package to arrange plots.

SxS: side by side

library(gridExtra)
p1 <- ggplot(bank) + geom_bar(mapping = aes(x = age, fill = job), position = "fill") + coord_polar()
p2 <- ggplot(bank) + geom_bar(mapping = aes(x = age, fill = education), position = "fill") + coord_polar()
grid.arrange(p1, p2, ncol=2, nrow=1)

grid.arrange(p1, p2, ncol=2, nrow=1, widths = c(4,2))

grid.arrange(p1, p2, ncol=1, nrow=2, heights = c(4,2))

ggplot/gridExtra

a bit more complicated

library(tibble)
library(ggplot2)
library(gridExtra)

df <- tibble(x = rnorm(1000), y = rnorm(1000))

hist_top <- ggplot(df, aes(x = x)) + geom_density()

empty <-
  ggplot()+geom_point(aes(1,1), colour="white")+
  theme(axis.ticks=element_blank(), 
        panel.background=element_blank(), 
        axis.text.x=element_blank(), axis.text.y=element_blank(),
        axis.title.x=element_blank(), axis.title.y=element_blank())

scatter <- ggplot(df, aes(x = x, y = y)) + geom_point()

hist_right <- ggplot(df, aes(x = y)) + geom_density() + coord_flip()

grid.arrange(hist_top, empty, scatter, hist_right, ncol=2, nrow=2, widths=c(3.5, 0.7), heights=c(1, 4))

knitr/kableExtra

kable is provided by knitr package. kableExtra enhance it with more functions. So we load both packages.

```{r shiny_block}
library(knitr)
library(kableExtra)

# This is HTML output
kable(df, format = "html")

# Use function() { } to output html
output$p1 <- function() {
  kable(df, format = "html")
}
```

kable_styling

mtcars[1:10, , drop = F] %>%
kable("html") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                font_size = 12,
                full_width = F, # True for left-to-right width
                position = "left") # if full_width == F
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
mtcars[1:10, , drop = F] %>%
kable("html") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                font_size = 12,
                full_width = F, # True for left-to-right width
                position = "left") %>% # if full_width == F
    column_spec(1, bold = T, border_right = T) %>%
    column_spec(2, width = "30em", background = "yellow")
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
mtcars[1:10, , drop = F] %>%
kable("html") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                font_size = 12,
                full_width = F, # True for left-to-right width
                position = "left") %>% # if full_width == F
    column_spec(5:7, bold = T) %>%
    row_spec(3:5, bold = T, color = "white", background = "#D7261E")    
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4

vol_surface <- tibble(tenor = c("1M", "2M", "3M", "6M"),
                      `0.1` = c(0.472, 0.435, 0.391, 0.29),
                      `0.25` = c(0.431, 0.41, 0.337, 0.28),
                      `0.5` = c(0.398, 0.30, 0.251, 0.2),
                      `0.75` = c(0.428, 0.336, 0.307, 0.249),
                      `0.9` = c(0.457, 0.411, 0.391, 0.278))

# "magma" (or "A"), "inferno" (or "B"), "plasma" (or "C"), and "viridis" (or "D", the default option).
library(knitr)
library(kableExtra)
gather(vol_surface, key = "delta", value = "vol", -tenor) %>%
# cell_spec takes column vol. spec_color also takes column vol values into consideration.
# We take half of the spectrurm - yellow to red.
mutate(vol = cell_spec(
    vol, "html", color = "black", bold = T,
    background = spec_color(vol, begin = 0.5, end = 1, option = "A", direction = -1))) %>%
spread(key = "delta", value = "vol") %>%
  kable("html", escape = F, align = "c") %>%
  kable_styling("striped", full_width = F)
tenor 0.1 0.25 0.5 0.75 0.9
1M 0.472 0.431 0.398 0.428 0.457
2M 0.435 0.41 0.3 0.336 0.411
3M 0.391 0.337 0.251 0.307 0.391
6M 0.29 0.28 0.2 0.249 0.278
mtcars[1:10, 1:2] %>%
  mutate(
    car = row.names(.),
    mpg = cell_spec(mpg, "html", color = ifelse(mpg > 20, "red", "blue")),
    cyl = cell_spec(cyl, "html", color = "white", align = "c", angle = 45, 
                    background = factor(cyl, c(4, 6, 8), 
                                        c("#666666", "#999999", "#BBBBBB")))
  ) %>%
  select(car, mpg, cyl) %>%
  kable("html", escape = F) %>%
  kable_styling("striped", full_width = F)

volatility surface.

Include all cells for colors, using gather, cell_spec, then spread

iris[1:10, ] %>% mutate_if(is.numeric, function(x) { cell_spec(x, “html”, bold = T, color = spec_color(x, end = 0.9), font_size = spec_font_size(x)) }) %>% mutate(Species = )) %>% kable(“html”, escape = F, align = “c”) %>% kable_styling(“striped”, full_width = F)

Lecture 10: Model

model

models <- mtcars %>% split(.$cyl) %>% map(function(df) lm(mpg ~ wt, data = df))

models <- mtcars %>% split(.$cyl) %>% map(~lm(mpg ~ wt, data = .))

model with Modelr

library(modelr)

mod <- lm(log(balance) ~ log(age), data = bank)

bank1 <- filter(bank, default == “no” & balance > 0) mod <- lm(log(balance) ~ log(age), data = bank1)

bank2 <- bank1 %>% add_residuals(mod) %>% mutate(resid = exp(resid))

ggplot(data = bank2) + geom_point(mapping = aes(x = age, y = resid))

First, you define a family of models that express a precise, but generic, pattern that you want to capture. For example, the pattern might be a straight line, or a quadatric curve. You will express the model family as an equation like y = a_1 * x + a_2 or y = a_1 * x ^ a_2. Here, x and y are known variables from your data, and a_1 and a_2 are parameters that can vary to capture different patterns.

Next, you generate a fitted model by finding the model from the family that is the closest to your data. This takes the generic model family and makes it specific, like y = 3 * x + 7 or y = 9 * x ^ 2.

install.package(“purrr”, “modelr”) library(purrr) library(modelr)

Data set

sim1

model1 <- function(a, data) { a[1] + data$x * a[2] }

measure_distance <- function(mod, data) { diff <- data\(y - model1(mod, data) sqrt(mean(diff ^ 2)) } best <- optim(c(0, 0), measure_distance, data = sim1) best\)par

sim1_mod <- lm(y ~ x, data = sim1) coef(sim1_mod)

prediction is for the new data

sim1 %>% data_grid(x) %>% # generate data set. add_predictions(sim1_mod)

ggplot(sim1, aes(x)) + geom_point(aes(y = y)) + geom_line(aes(y = pred), data = grid, colour = “red”, size = 1)

residual is for the existing data

add_residuals(sim1_mod, sim1)

ggplot(sim1, aes(resid)) + geom_freqpoly(binwidth = 0.5)

apply it for the bank dataset

ggplot(bank) + geom_bar(aes(x = age, fill = y))

model_matrix(bank, y ~ age)

bank_mod <- lm(y ~ age, data = mutate(bank, y = ifelse(y == “yes”, 1, 0)))

add_prediction

mutate(bank, y = ifelse(y == “yes”, 1, 0)) %>% data_grid(age) %>% add_predictions(bank_mod) mutate(bank, y = ifelse(y == “yes”, 1, 0)) %>% data_grid(age) %>% add_predictions(bank_mod) %>% ggplot(aes(x = age, y = pred)) + geom_point()

mutate(bank, y = ifelse(y == “yes”, 1, 0)) %>% add_residuals(bank_mod) %>% ggplot(aes(resid)) + geom_freqpoly(binwidth = 0.05)

2nd model

bank_mod <- lm(y ~ age * job, data = mutate(bank, y = ifelse(y == “yes”, 1, 0))) mutate(bank, y = ifelse(y == “yes”, 1, 0)) %>% data_grid(age, job) %>% add_predictions(bank_mod) %>% ggplot(aes(x = age, colour = job)) + geom_line(aes(y = pred))

mutate(bank, y = ifelse(y == “yes”, 1, 0)) %>% add_residuals(bank_mod) %>% ggplot(aes(x = age, resid, colour = job)) + geom_point()

model

model_matrix(df, y ~ I(x^2) + x)

try age with balance?

with different jobs

by_job <- group_by(bank, job) %>% nest() job_balance <- function(df) { lm(balance ~ age, data = df) } models <- mutate(by_job, model = purrr::map(data, job_balance))

by_job_res <- models %>% mutate(resids = map2(data, model, add_residuals))

resids <- unnest(by_job_res, resids)

resids %>% ggplot(aes(age, resid)) + geom_line(aes(colour = job)) + geom_smooth(se = FALSE)

resids %>% ggplot(aes(age, resid, group = job)) + geom_line(alpha = 1 / 3) + facet_wrap(~job)

natural spine

mod1 <- lm(y ~ ns(x, 1), data = sim5)

mod1 <- lm(y ~ x1 + x2, data = sim3) mod2 <- lm(y ~ x1 * x2, data = sim3)

Regression Model

library(ggplot2)

set.seed(123)
N <- 1000
x <- rnorm(N)
f <- function(x) 50*x^2/(1 + 4*x) # data-simulating function

y <- f(x) + rnorm(N, sd=3)

point_data <- data.frame(x, y)

library(tidyverse)

ggplot(point_data, aes(x=x, y=y)) + 
  geom_point() + 
  ylim(-100, 100) + 
  ggtitle("simulated data points")
## Warning: Removed 3 rows containing missing values (geom_point).


fit_pade <- function(point_data){
  fit <- lm(y ~ x + I(x^2) + I(y*x) + I(y*x^2), point_data)
  lm_coef <- as.list(coef(fit))
  names(lm_coef) <- c("a0", paste0(rep(c('a','b'), each=2), 1:2))
  
  with(lm_coef, function(x)(a0 + a1*x + a2*x^2)/(1 - b1*x - b2*x^2))
}

plot_fitted_function <- function(x_data, fitted_fun, title){
  x_data$y_hat <- fitted_fun(x_data$x)
  g <- ggplot(x_data, aes(x=x, y=y)) + 
    geom_point() + ylim(-100, 100) +
    geom_line(aes(y=y_hat), col="red", size=1) + 
    ggtitle(title)
  
  plot(g)
}

pade_approx <- fit_pade(point_data)

plot_fitted_function(point_data, pade_approx, title="fitted function")
## Warning: Removed 3 rows containing missing values (geom_point).


function_list <- list(
  function(x) (100 - 50*x - 100*x^2)/(1 - 50*x - 5*x^2),
  function(x) (100 - 50*x - 100*x^2)/(1 - 10*x - 5*x^2),
  function(x) (100 - 50*x - 100*x^2)/(1 - 10*x - 10*x^2)
)

for (f in function_list){
  sim_data <- point_data %>% mutate(y=f(x) + 
                                      rnorm(nrow(point_data), sd=3))
  plot_fitted_function(sim_data, fit_pade(sim_data), 
                       title=as.character(deparse(f))[2])
}
## Warning: Removed 14 rows containing missing values (geom_point).

## Warning: Removed 96 rows containing missing values (geom_point).

## Warning: Removed 86 rows containing missing values (geom_point).

K-means

K-means between two categorical variables

dev.off() data.model <- select(bank, balance, education) %>% mutate(education = factor(education)) plot(data.model) combination <- model.matrix(~ . - 1, data.model) data.model\(cl <- kmeans(combination, 4)\)cluster with(data.model, plot(balance, education, col = cl))

K-means between one categorical variable + one numeric variable

dev.off() data.model <- select(bank, job, education) %>% mutate(job = factor(job), education = factor(education)) plot(data.model) combination <- model.matrix(~ . - 1, data.model) data.model\(cl <- kmeans(combination, 2)\)cluster with(data.model, plot(job, education, col = cl))

K-means between two numeric variables

dev.off() data.model <- select(bank, balance, duration) plot(data.model) data.model\(cl <- kmeans(data.model[, 1:2], 4)\)cluster with(data.model, plot(balance, duration, col = cl)) with(data.model, text(balance, duration, col=cl))

draw the

library(xts) xts object => store prices and retrieve.

library(fOption)

Option valuation.

How to value a portfolio of stocks?

How to value a portfolio of options?

How to simulate portfolio gain.

How to do SVD analysis?

Object way and Data frame way

sample option portfolio sample stock portfolio